Drp1 isoform analysis with UMI-deduplication

Preprossing pipeline

Base calling and demultiplexing

I run dorado/0.7.0 basecalling (enable barcoding detection with --kit-name will automatic trim primer and barcode, so --no-trim has no effect).

dorado basecaller sup $pod5 --kit-name SQK-NBD114-24 --no-trim > ${dir}/basecalled_reads.bam
dorado summary ${dir}/basecalled_reads.bam >  ${dir}/basecalled_summary.txt

Demux into separate files and then fastq files.

dorado demux --threads 16 --output-dir ${dir}/demux --no-classify  ${dir}/basecalled_reads.bam

Run Flexiplex to remove barcode and UMI

  • 1, First step to get UMI and attach to the beginning of read.
  • 2, Second step to remove barcode.
  • 3, Then need to move UMI to the end of read.
  • Flexiplex is able to get UMI in ~80-90% reads.
zcat ${raw_reads} | \
flexiplex -x $TSO_REV_CMP -u $UMI_REV_CMP -x $UMI_LEFT_FLANK_REV_CMP \
  -b "" -k "?" \
  -f 2 -e 1 -p 8 -n "UMI_extraction" | \
flexiplex -x $BC_LEFT_FLANK_FWD -b $BC_SEQ -x $BC_RIGHT_FLANK_FWD -k "?" -f 1 -p 8 -n "BC_extraction"  | \
sed 's/\(\?_#?\)\(_[^#]*\)\(#.*\)/\1\3\2/' > ${base}_fp.fastq

Run UMI-tools to deduplicate UMI

  • Map to mouse NCBI (GRCm39) or human (GRCh38) genome using the code below.
  • Then UMI-tools to remove duplicates based on UMI and coordinate (only look at the chromosome with DNM1L).
paftools.js gff2bed $gtf > ${name}_anno.bed
minimap2 -ax splice -I 1000G -t $num_cores --junc-bed ${name}_anno.bed $genome $raw_reads | samtools sort -@ $num_cores -O BAM -o ${name}_sorted.bam
samtools index -@ $num_cores ${name}_sorted.bam
umi_tools dedup -I ${name}_sorted.bam --chrom=${chr} --output-stats=${name} --edit-distance-threshold=2 --method=directional -S ${name}_dedup.bam

Merge bam files and then run Bambu

Run Bambu with gencode annotation (NCBI Refseq GCA_000001405.15 for human and GCF_000001635.27 for mouse) with NDR=0.2 (relative strigent detection of novel isoform).

samtools merge -o Drp1_human_dge/merged/${name}_sorted_dedup_merged.bam ${run1} ${run2}
samtools merge -o Drp1_mouse_dge/merged/${name}_sorted_dedup_merged.bam ${run1} ${run2}

Rscript --vanilla bambu_analysis.R Drp1_human_dge/merged
Rscript --vanilla bambu_analysis.R Drp1_mouse_dge/merged

Visualization of prelimary results

Code
library(ggfortify)
library(tidyverse)
library(bambu)
library(pheatmap)
library(DT)
library(ggh4x)
library(cowplot)
library(ComplexHeatmap)
library(RColorBrewer)

Mouse data, initial QC stats

Code
se1 <- readRDS("Drp1_mouse_dge/merged/default/se1.rds")[, -c(9:10)]
group <- factor(rep(c('Brain','Lung','Spleen','Heart',#'Liver',
                      'Kidney','Muscle'), each = 2))
Code
library.size <- colSums(assays(se1)$counts)
gene.count <- colSums(assays(se1)$count[rowData(se1)$GENEID == 'Dnm1l',])
capture.rate <- gene.count/library.size
# hist(capture.rate, main = 'capture rate in chr16')

anno.col <- data.frame(capture.rate = capture.rate,
                       count_DNM1L = gene.count,
                       count_chr16 = library.size,
                       group = group)
write.csv(anno.col, 'mouse_stats.csv')

anno.col %>% 
  pivot_longer(1:3) %>% 
  ggplot(aes(x = group, y = value, col = group)) +
  geom_point() +
  facet_wrap(.~name, scales = 'free_y') +
  scale_color_brewer(palette = "Set2") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

Fig1 Sequencing stats of each sample
Code
p1 <- anno.col %>% select(capture.rate, group) %>%
  ggplot(aes(x = group, y = capture.rate, col = group)) +
  geom_point(size = 4) +
  scale_color_brewer(palette = "Set2") +
    labs(
    title = "Capture Rate in Chr12",
    x = "Group",
    y = "Proportion",
    color = "Group" # Clear legend title
  ) + 
  scale_y_continuous(limits = c(0, NA)) +
  geom_hline(yintercept = 1, linetype = 'dashed', col = 'grey70') +
  force_panelsizes(rows = unit(3, 'in'),
                   cols = unit(3, 'in')) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
        legend.position = "bottom")

p2 <- anno.col %>% select(count_DNM1L, group) %>%
  ggplot(aes(x = group, y = count_DNM1L/1e3, col = group)) +
  geom_point(size = 4) +
  scale_color_brewer(palette = "Set2") +
    labs(
    title = "DNM1L Counts",
    x = "Group",
    y = "Counts (*1000)",
    color = "Group" # Clear legend title
  ) + 
  scale_y_continuous(limits = c(0, NA)) +
  force_panelsizes(rows = unit(3, 'in'),
                   cols = unit(3, 'in')) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
        legend.position = "bottom")

p3 <- anno.col %>% select(count_chr16, group) %>%
  ggplot(aes(x = group, y = count_chr16/1e3, col = group)) +
  geom_point(size = 4) +
  scale_color_brewer(palette = "Set2") +
    labs(
    title = "Chr12 Total Counts",
    x = "Group",
    y = "Counts (*1000)",
    color = "Group" # Clear legend title
  ) +
  scale_y_continuous(limits = c(0, NA)) +
  force_panelsizes(rows = unit(3, 'in'),
                   cols = unit(3, 'in')) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
        legend.position = "bottom")

p <- plot_grid(p1,p2,p3, nrow = 1)
print(p)

Fig1 Sequencing stats of each sample
Code
ggsave('eda_plot/qc.pdf', width = 12, height = 5)

PCA plots using log2CPM

Code
exp <- log2(assays(se1)$CPM + 1) 
exp <- exp[rowSums(exp)!=0, ] # filter for non expressed and liver

# group <- group[-c(9:10), drop=T]

colnames(exp) <- paste(group, c('S1','S2'), sep = '_')  # rename by replicate (S) 

# anno.col <- anno.col[-c(9:10),]

autoplot(prcomp(exp %>% t()), 
         data = anno.col,
         color = 'group', size = 'capture.rate') +
  scale_color_brewer(palette = "Set2") +
  force_panelsizes(rows = unit(3, 'in'),
                   cols = unit(3, 'in')) 
autoplot(prcomp(exp %>% t()), 
         data = anno.col,
         color = 'group', size = 'count_chr16') +
  scale_color_brewer(palette = "Set2") +
  force_panelsizes(rows = unit(3, 'in'),
                   cols = unit(3, 'in')) 
autoplot(prcomp(exp %>% t()), 
         data = anno.col,
         color = 'group', size = 'count_DNM1L') +
  scale_color_brewer(palette = "Set2") +
  force_panelsizes(rows = unit(3, 'in'),
                   cols = unit(3, 'in')) 
autoplot(prcomp(exp %>% t()), 
         data = anno.col,
         color = 'group', size = 4) +
  scale_color_brewer(palette = "Set2") +
  force_panelsizes(rows = unit(3, 'in'),
                   cols = unit(3, 'in')) 

Fig2A PCA plot using log2CPM of all transcripts of chr16

Fig2A PCA plot using log2CPM of all transcripts of chr16

Fig2A PCA plot using log2CPM of all transcripts of chr16

Fig2A PCA plot using log2CPM of all transcripts of chr16
Code
exp <- log2(assays(se1)$CPM[rowData(se1)$GENEID == 'Dnm1l',] + 1) 
exp <- exp[rowSums(exp)!=0, ] # filter for non expressed and liver

colnames(exp) <- paste(group, c('S1','S2'), sep = '_') # rename by replicate (S) 

autoplot(prcomp(exp %>% t()), 
         data = anno.col,
         color = 'group', size = 'capture.rate') +
  scale_color_brewer(palette = "Set2") +
  force_panelsizes(rows = unit(3, 'in'),
                   cols = unit(3, 'in')) 
autoplot(prcomp(exp %>% t()), 
         data = anno.col,
         color = 'group', size = 'count_chr16') +
  scale_color_brewer(palette = "Set2") +
  force_panelsizes(rows = unit(3, 'in'),
                   cols = unit(3, 'in')) 
autoplot(prcomp(exp %>% t()), 
         data = anno.col,
         color = 'group', size = 'count_DNM1L') +
  scale_color_brewer(palette = "Set2") +
  force_panelsizes(rows = unit(3, 'in'),
                   cols = unit(3, 'in')) 
autoplot(prcomp(exp %>% t()), 
         data = anno.col,
         color = 'group', size = 4) +
  scale_color_brewer(palette = "Set2") +
  force_panelsizes(rows = unit(3, 'in'),
                   cols = unit(3, 'in')) 

Fig2B PCA plot using only log2CPM of DNM1L

Fig2B PCA plot using only log2CPM of DNM1L

Fig2B PCA plot using only log2CPM of DNM1L

Fig2B PCA plot using only log2CPM of DNM1L

Heatmap using log2CPM

Code
plotBambu(se1, type = "annotation", gene_id = "Dnm1l")

Fig3 Bambu output showing the structure and expression of isoforms
[[1]]
TableGrob (3 x 1) "arrange": 3 grobs
  z     cells    name                 grob
1 1 (2-2,1-1) arrange       gtable[layout]
2 2 (3-3,1-1) arrange       gtable[layout]
3 3 (1-1,1-1) arrange text[GRID.text.1464]
Code
# anno.col <- data.frame(capture.rate = capture.rate,
#                        group = group,
#                        library.size = colSums(assays(se1)$counts))
rownames(anno.col) <- colnames(exp)

ann_colors <- list(group = setNames(brewer.pal(6, "Set2"), levels(group)),
                   capture.rate = brewer.pal(n = 9, name = "BuPu"),
                   count_DNM1L = brewer.pal(n = 9, name = "BuGn"))

exp %>%
  pheatmap::pheatmap(cellwidth = 15, cellheight = 10, 
           annotation_col = anno.col[, 4, drop = F] ,
           annotation_colors = ann_colors,
           scale = 'none',
           show_colnames = F,
           main = 'Heatmap using log2(CPM+1)',
           width = 7, height = 15)

Fig4 Heatmap showing isoform expression, clustered by default
Code
exp %>%
  pheatmap::pheatmap(cellwidth = 15, cellheight = 10, 
           annotation_col = anno.col[, 4, drop = F] ,
           annotation_colors = ann_colors,
           scale = 'row',
           show_colnames = F,
           main = 'Heatmap using z-scaled log2(CPM+1)',
           width = 7, height = 15)

Fig4 Heatmap showing isoform expression, clustered by default

Boxplot using log2CPM

Code
# 1) Align groups to exp2 columns (samples)
stopifnot(all(colnames(exp) %in% rownames(anno.col)))

# 2) Long-format: one row per (transcript, sample)
df_long <- as.data.frame(exp) %>%
  rownames_to_column("transcript") %>%
  pivot_longer(-transcript, names_to = "sample", values_to = "log2CPM") %>%
  left_join(anno.col %>% rownames_to_column() %>% select(rowname, group), 
            by = c('sample' = 'rowname')) %>%
  mutate(transcript = factor(transcript, 
                             levels = c("NM_152816.4", "NM_001025947.3", "NM_001276340.2",
                                        "NM_001405252.1", "NM_001276341.2", "NM_001360007.2",
                                        "NM_001360008.2","NM_001360009.2", "NM_001360010.2",
                                        "NM_001405253.1","NM_001405254.1","NM_001405255.1",
                                        "NM_001405256.1","NM_001405257.1","NM_001405258.1",
                                        "NM_001405259.1","NM_001405260.1","NM_001405261.1",
                                        "NM_001405262.1","NM_001405263.1","NM_001405264.1","NM_001405265.1",
                                        "NR_075074.2","NR_175921.1",
                                        "NR_175922.1","NR_175923.1","NR_175924.1","NR_175925.1","NR_175926.1",
                                        "NR_175927.1","NR_175928.1","NR_175929.1","NR_175930.1","NR_175931.1",
                                        "NR_175932.1","NR_175933.1", "XM_006522638.5")))

# 3) Boxplots of expression by group, faceted by transcript (one facet per row of exp2)
ggplot(df_long, aes(x = group, y = log2CPM, fill = group)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.3, size = 0.8) +
  scale_fill_brewer(palette = "Set2") +
  labs(x = NULL, y = "Transcript expression log2(CPM+1)") +
  theme_classic() +
  theme(legend.position = "none",
        strip.text = element_text(size = 9),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
  facet_wrap(~ transcript, scales = "free_y", ncol = 8)  +
    force_panelsizes(rows = unit(2, 'in'),
                   cols = unit(2, 'in')) 

Code
ggplot(df_long, aes(x = transcript, y = log2CPM, fill = group)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.3, size = 0.8) +
  scale_fill_brewer(palette = "Set2") +
  labs(x = NULL, y = "Transcript expression log2(CPM+1)") +
  theme_classic() +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
        strip.text = element_text(size = 9)) +
  facet_wrap(~ group, scales = "free_y")  +
    force_panelsizes(rows = unit(3, 'in'),
                   cols = unit(3, 'in')) 

Only show NM transcripts

Code
# 3) Boxplots of expression by group, faceted by transcript (one facet per row of exp2)
ggplot(df_long %>%
         filter(#transcript %in% c("NM_001025947.3", "NM_001405252.1", "NM_001360007.2", "NM_001405259.1")
                str_detect(transcript, '^NM')), 
       aes(x = group, y = log2CPM, fill = group)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.3, size = 0.8) +
  scale_fill_brewer(palette = "Set2") +
  labs(x = NULL, y = "Transcript expression log2(CPM+1)") +
  theme_classic() +
  theme(legend.position = "none",
        strip.text = element_text(size = 9),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
  facet_wrap(~ transcript, scales = "free_y", ncol = 5)  +
    force_panelsizes(rows = unit(2, 'in'),
                   cols = unit(2, 'in')) 

Code
ggplot(df_long %>%
         filter(#transcript %in% c("NM_001025947.3", "NM_001405252.1", "NM_001360007.2", "NM_001405259.1")
                str_detect(transcript, '^NM')),
       aes(x = transcript, y = log2CPM, fill = group)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.3, size = 0.8) +
  scale_fill_brewer(palette = "Set2") +
  labs(x = NULL, y = "Transcript expression log2(CPM+1)") +
  theme_classic() +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
        strip.text = element_text(size = 9)) +
  facet_wrap(~ group, scales = "free_y", , ncol = 3)  +
    force_panelsizes(rows = unit(3, 'in'),
                   cols = unit(3, 'in')) 

PCA using log2TPM

This is only done because we need to caluclate TSI based on TPM, TPM can eliminate transcript length bias. Though in this data, the transcript length are quite similar, effects are minimal

Code
# Map samples to tissues (assumes colnames(exp2) are sample IDs)
stopifnot(all(colnames(exp) %in% rownames(anno.col)))

## get TPM first

calculateTPM <- function(se = se) {
  
  counts <- assays(se)$counts %>% colSums()
  incompcounts <- data.frame(metadata(se)$incompatibleCounts)[, colnames(se)] %>% colSums()
  totalcounts <- counts + incompcounts
  
  tx_lengths <- rowRanges(se) %>% width() %>% sum
  
  stopifnot(rownames(counts) == names(tx_lengths))
  
  tpm <- assays(se)$counts/tx_lengths*1e3
  sum <- colSums(tpm)
  
  tmp <- tpm/sum*1e6
  return(tpm)

}

tpm <- calculateTPM(se1)
colnames(tpm) <- colnames(exp)

# PCA 
autoplot(prcomp(log2(tpm+1) %>% t()), 
         data = anno.col,
         color = 'group', size = 4) +
  scale_color_brewer(palette = "Set2") +
  force_panelsizes(rows = unit(3, 'in'),
                   cols = unit(3, 'in')) +
  ggtitle('PCA using log2(TPM+1) for chr16 transcripts')

Code
tpm <- tpm[rownames(exp), ]

autoplot(prcomp(log2(tpm+1) %>% t()), 
         data = anno.col,
         color = 'group', size = 4) +
  scale_color_brewer(palette = "Set2") +
  force_panelsizes(rows = unit(3, 'in'),
                   cols = unit(3, 'in')) +
  ggtitle('PCA using log2(TPM+1) of Dnm1l transcripts')

Heatmap using log2TPM

Code
log2(tpm + 1) %>%
  pheatmap::pheatmap(cellwidth = 15, cellheight = 10, 
           annotation_col = anno.col[, 4, drop = F] ,
           annotation_colors = ann_colors,
           scale = 'none',
           show_colnames = F,
           main = 'Heatmap using log2(TPM+1)',
           width = 7, height = 15)

Code
log2(tpm + 1) %>%
  pheatmap::pheatmap(cellwidth = 15, cellheight = 10, 
           annotation_col = anno.col[, 4, drop = F] ,
           annotation_colors = ann_colors,
           scale = 'row',
           show_colnames = F,
           main = 'Heatmap using z-scaled log2(TPM+1)',
           width = 7, height = 15)

Boxplot using log2TPM

Code
df_long2 <- as.data.frame(tpm) %>%
  rownames_to_column("transcript") %>%
  pivot_longer(-transcript, names_to = "sample", values_to = "TPM") %>%
  left_join(anno.col %>% rownames_to_column() %>% select(rowname, group), 
            by = c('sample' = 'rowname')) %>%
  mutate(transcript = factor(transcript, 
                             levels = c("NM_152816.4", "NM_001025947.3", "NM_001276340.2",
                                        "NM_001405252.1", "NM_001276341.2", "NM_001360007.2",
                                        "NM_001360008.2","NM_001360009.2", "NM_001360010.2",
                                        "NM_001405253.1","NM_001405254.1","NM_001405255.1",
                                        "NM_001405256.1","NM_001405257.1","NM_001405258.1",
                                        "NM_001405259.1","NM_001405260.1","NM_001405261.1",
                                        "NM_001405262.1","NM_001405263.1","NM_001405264.1","NM_001405265.1",
                                        "NR_075074.2","NR_175921.1",
                                        "NR_175922.1","NR_175923.1","NR_175924.1","NR_175925.1","NR_175926.1",
                                        "NR_175927.1","NR_175928.1","NR_175929.1","NR_175930.1","NR_175931.1",
                                        "NR_175932.1","NR_175933.1", "XM_006522638.5")))

# 3) Boxplots of expression by group, faceted by transcript (one facet per row of exp2)
ggplot(df_long2, aes(x = group, y = log2(TPM+1), fill = group)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.3, size = 0.8) +
  scale_fill_brewer(palette = "Set2") +
  labs(x = NULL, y = "Transcript expression log2(TPM+1)") +
  theme_classic() +
  theme(legend.position = "none",
        strip.text = element_text(size = 9),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
  facet_wrap(~ transcript, scales = "free_y", ncol = 8)  +
    force_panelsizes(rows = unit(2, 'in'),
                   cols = unit(2, 'in')) 

Code
ggplot(df_long2, aes(x = transcript, y = log2(TPM+1), fill = group)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.3, size = 0.8) +
  scale_fill_brewer(palette = "Set2") +
  labs(x = NULL, y = "Transcript expression log2(TPM+1)") +
  theme_classic() +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
        strip.text = element_text(size = 9)) +
  facet_wrap(~ group, scales = "free_y")  +
    force_panelsizes(rows = unit(3, 'in'),
                   cols = unit(3, 'in')) 

Code
# Only show NM transcripts
ggplot(df_long2 %>%
         filter(#transcript %in% c("NM_001025947.3", "NM_001405252.1", "NM_001360007.2", "NM_001405259.1")
                str_detect(transcript, '^NM')), 
       aes(x = group, y = log2(TPM+1), fill = group)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.3, size = 0.8) +
  scale_fill_brewer(palette = "Set2") +
  labs(x = NULL, y = "Transcript expression log2(TPM+1)") +
  theme_classic() +
  theme(legend.position = "none",
        strip.text = element_text(size = 9),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
  facet_wrap(~ transcript, scales = "free_y", ncol = 5)  +
    force_panelsizes(rows = unit(2, 'in'),
                   cols = unit(2, 'in')) 

Code
ggplot(df_long2 %>%
         filter(#transcript %in% c("NM_001025947.3", "NM_001405252.1", "NM_001360007.2", "NM_001405259.1")
                str_detect(transcript, '^NM')),
       aes(x = transcript, y = log2(TPM+1), fill = group)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.3, size = 0.8) +
  scale_fill_brewer(palette = "Set2") +
  labs(x = NULL, y = "Transcript expression log2(TPM+1)") +
  theme_classic() +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
        strip.text = element_text(size = 9)) +
  facet_wrap(~ group, scales = "free_y", , ncol = 3)  +
    force_panelsizes(rows = unit(3, 'in'),
                   cols = unit(3, 'in')) 

Code
# Mean TPM per (transcript, tissue), fill missing tissues with 0 to keep range [1/n, 1]
xi_tbl <- df_long2 %>%
  group_by(transcript, group) %>%
  summarise(xi = mean(TPM, na.rm = TRUE))

# TSI = max(xi) / sum(xi)
tsi_tbl <- xi_tbl %>%
  group_by(transcript) %>%
  summarise(
    sum_x = sum(xi),
    max_x = max(xi),
    TSI   = ifelse(sum_x > 0, max_x / sum_x, NA_real_),
    .groups = "drop"
  ) %>%
  arrange(desc(TSI))

print(tsi_tbl)
# A tibble: 37 × 4
   transcript       sum_x  max_x   TSI
   <fct>            <dbl>  <dbl> <dbl>
 1 NM_001360007.2   804.   743.  0.925
 2 NM_001360008.2 10829.  9661.  0.892
 3 NR_175921.1       33.3   28.4 0.853
 4 NM_001360009.2   441.   346.  0.783
 5 NM_001405254.1  6227.  4855.  0.780
 6 NR_175928.1      129.    99.0 0.766
 7 NR_175925.1      235.   152.  0.649
 8 NM_001405259.1   306.   197.  0.643
 9 NM_152816.4     5905.  3774.  0.639
10 NR_175931.1       75.6   43.5 0.575
# ℹ 27 more rows

Statistical test using log2CPM and log2TPM

Average expression as log2(CPM+1), TPM of each cell type, and ANOVA p values (both logCPM and logTPM).

Code
pvals <- sapply(1:nrow(exp), function(x){
  
  anova(lm(exp[x,]~group))$`Pr(>F)`[1]
  
}, simplify = T)

qvals <- p.adjust(pvals, method = 'BH')

pvals2 <- sapply(1:nrow(tpm), function(x){
  
  anova(lm(log2(tpm+1)[x,]~group))$`Pr(>F)`[1]
  
}, simplify = T)

qvals2 <- p.adjust(pvals2, method = 'BH')

# Initialize an empty matrix to store the results
result <- matrix(NA, nrow = nrow(exp), ncol = length(levels(group)))
colnames(result) <- levels(group)
rownames(result) <- rownames(exp)

# Calculate the mean value in each row for each group
for (i in seq_along(levels(group))) {
  group_samples <- which(group == levels(group)[i])
  result[, i] <- rowMeans(exp[, group_samples], na.rm = TRUE)
}

# Convert the result to a data frame for better readability
result <- as.data.frame(result)

result <- data.frame(result, pvals_lcpm = pvals, FDR_lcpm = qvals, 
                     pvals_ltpm = pvals2, FDR_ltpm = qvals2,  
                     exp, tpm,
                    check.names = FALSE) 

colnames(result)[11:ncol(result)] <- paste(colnames(result)[11:ncol(result)], 
                                 rep(c('log2CPM', 'TPM'), each = 12), 
                                 sep = '_') 

result <- left_join(result %>% rownames_to_column(var = 'transcript'),
                    tsi_tbl, by = 'transcript')

datatable(result[c(1:11, 36:38, 12:35)]) %>%
  formatRound(columns=2:ncol(result), digits=3)
Code
write.csv(result, 'mouse_anova.csv')

Statistical analysis using edgeR QL test

Code
library(edgeR)
dge <- DGEList(counts = assays(se1)$counts, 
               genes = rowData(se1) %>% data.frame(),
               group = group, 
               remove.zeros = T)

y <- normLibSizes(dge)
y$samples
                               group lib.size norm.factors
barcode09_sorted_dedup_merged  Brain   136596    0.7396017
barcode10_sorted_dedup_merged  Brain   156264    1.0161234
barcode11_sorted_dedup_merged   Lung    92574    1.4487393
barcode12_sorted_dedup_merged   Lung    69151    1.1977898
barcode13_sorted_dedup_merged Spleen    94700    2.1119330
barcode14_sorted_dedup_merged Spleen    75260    1.8815252
barcode15_sorted_dedup_merged  Heart   123414    0.7545560
barcode16_sorted_dedup_merged  Heart   184351    0.3465053
barcode19_sorted_dedup_merged Kidney   120279    0.9096561
barcode20_sorted_dedup_merged Kidney   116206    1.1283612
barcode21_sorted_dedup_merged Muscle   131953    0.6440550
barcode22_sorted_dedup_merged Muscle   116573    1.1164644
Code
## edgeR manual section 3.2.6
## ANOVA like test
design <- model.matrix(~group, data = y$samples)
colnames(design)[-1] <- levels(group)[-1]

y <- estimateDisp(y, design, robust = T)
y$common.dispersion
[1] 1.230459
Code
plotBCV(y)

Code
fit <- glmQLFit(y, design, robust = T)
qlf <- glmQLFTest(fit, coef = 2:ncol(design))

deg <- topTags(qlf, n = Inf, p.value = Inf) %>% 
  data.frame() %>%
  filter(GENEID == 'Dnm1l') 

# fold change here is relative to Brain
deg %>% 
  select(c(1:2, 12:ncol(deg))) %>% 
  datatable() %>%
  formatRound(columns=3:11, digits=3)
Code
write.csv(deg[, -11], 'mouse_edger_anovalike.csv') # remove eqClassById list column
Code
sessionInfo()
R version 4.2.3 (2023-03-15)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Red Hat Enterprise Linux 9.3 (Plow)

Matrix products: default
BLAS:   /stornext/System/data/software/rhel/9/base/tools/R/4.2.3/lib64/R/lib/libRblas.so
LAPACK: /stornext/System/data/software/rhel/9/base/tools/R/4.2.3/lib64/R/lib/libRlapack.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] grid      stats4    stats     graphics  grDevices utils     datasets 
[8] methods   base     

other attached packages:
 [1] edgeR_4.2.0                 limma_3.52.4               
 [3] RColorBrewer_1.1-3          ComplexHeatmap_2.12.1      
 [5] cowplot_1.1.3               ggh4x_0.2.8                
 [7] DT_0.29                     pheatmap_1.0.12            
 [9] bambu_3.6.0                 BSgenome_1.66.3            
[11] rtracklayer_1.58.0          Biostrings_2.66.0          
[13] XVector_0.38.0              SummarizedExperiment_1.28.0
[15] Biobase_2.58.0              GenomicRanges_1.50.2       
[17] GenomeInfoDb_1.34.9         IRanges_2.32.0             
[19] S4Vectors_0.36.2            BiocGenerics_0.44.0        
[21] MatrixGenerics_1.10.0       matrixStats_1.1.0          
[23] lubridate_1.9.2             forcats_1.0.0              
[25] stringr_1.5.1               dplyr_1.1.4                
[27] purrr_1.0.2                 readr_2.1.4                
[29] tidyr_1.3.1                 tibble_3.2.1               
[31] tidyverse_2.0.0             ggfortify_0.4.16           
[33] ggplot2_3.4.4              

loaded via a namespace (and not attached):
  [1] backports_1.4.1          circlize_0.4.16          Hmisc_5.1-1             
  [4] BiocFileCache_2.11.2     systemfonts_1.0.5        plyr_1.8.9              
  [7] lazyeval_0.2.2           splines_4.2.3            crosstalk_1.2.0         
 [10] BiocParallel_1.32.6      digest_0.6.34            ensembldb_2.20.2        
 [13] foreach_1.5.2            htmltools_0.5.7          fansi_1.0.6             
 [16] magrittr_2.0.3           checkmate_2.3.1          memoise_2.0.1           
 [19] cluster_2.1.4            doParallel_1.0.17        tzdb_0.4.0              
 [22] ggbio_1.44.1             timechange_0.2.0         prettyunits_1.2.0       
 [25] colorspace_2.1-0         blob_1.2.4               rappdirs_0.3.3          
 [28] textshaping_0.3.6        xfun_0.39                crayon_1.5.2            
 [31] RCurl_1.98-1.14          jsonlite_1.8.8           graph_1.74.0            
 [34] VariantAnnotation_1.42.1 iterators_1.0.14         glue_1.7.0              
 [37] gtable_0.3.4             zlibbioc_1.44.0          GetoptLong_1.0.5        
 [40] DelayedArray_0.24.0      shape_1.4.6              scales_1.3.0            
 [43] DBI_1.2.2                GGally_2.1.2             Rcpp_1.0.12             
 [46] progress_1.2.2           htmlTable_2.4.2          clue_0.3-65             
 [49] foreign_0.8-84           bit_4.0.5                OrganismDbi_1.38.1      
 [52] Formula_1.2-5            htmlwidgets_1.6.2        httr_1.4.7              
 [55] ellipsis_0.3.2           pkgconfig_2.0.3          reshape_0.8.9           
 [58] XML_3.99-0.14            farver_2.1.1             sass_0.4.7              
 [61] nnet_7.3-18              dbplyr_2.5.0             locfit_1.5-9.8          
 [64] utf8_1.2.4               tidyselect_1.2.1         labeling_0.4.3          
 [67] rlang_1.1.3              reshape2_1.4.4           AnnotationDbi_1.60.2    
 [70] munsell_0.5.0            tools_4.2.3              cachem_1.0.8            
 [73] xgboost_1.7.5.1          cli_3.6.2                generics_0.1.3          
 [76] RSQLite_2.3.6            evaluate_0.23            fastmap_1.1.1           
 [79] yaml_2.3.7               knitr_1.43               bit64_4.0.5             
 [82] AnnotationFilter_1.20.0  KEGGREST_1.38.0          RBGL_1.72.0             
 [85] xml2_1.3.5               biomaRt_2.54.1           compiler_4.2.3          
 [88] rstudioapi_0.15.0        filelock_1.0.3           curl_5.2.1              
 [91] png_0.1-8                statmod_1.5.0            bslib_0.5.0             
 [94] stringi_1.8.3            GenomicFeatures_1.50.4   lattice_0.20-45         
 [97] ProtGenerics_1.28.0      Matrix_1.6-5             vctrs_0.6.5             
[100] pillar_1.9.0             lifecycle_1.0.4          BiocManager_1.30.22     
[103] jquerylib_0.1.4          GlobalOptions_0.1.2      data.table_1.15.0       
[106] bitops_1.0-8             R6_2.5.1                 BiocIO_1.8.0            
[109] gridExtra_2.3            codetools_0.2-19         dichromat_2.0-0.1       
[112] rjson_0.2.21             withr_3.0.0              GenomicAlignments_1.34.1
[115] Rsamtools_2.14.0         GenomeInfoDbData_1.2.9   parallel_4.2.3          
[118] hms_1.1.3                rpart_4.1.19             rmarkdown_2.23          
[121] biovizBase_1.44.0        base64enc_0.1-3          restfulr_0.0.15